"""
Compute inventor-pair node (degree) distance measures based on co-invention experience before a given year.

1. get_inventor: construct a social network graph using all co-invention experience before a given year t.
2. load_simscore:
    use sim_score_1981_2015_top5.csv
    to get a dataframe with the cross product between the set of citing-patent inventors and cited-patent inventors.
3. compute_dist: compute the shortest path between each inventor pair from step 2; aggregate to the patent-pair level.

Date: 2023-03-29
Author: Shaoyu Liu

example code:
    cd /Volumes/Zihao_SSD2/     
    python3 code/inventor_deg_dist.py \
    --path_inventor PatentsView/g_inventor_disambiguated.tsv \
    --path_patent PatentsView/g_patent.tsv \
    --year 1981 \
    --output_dir PatentsView/network_results/

sbatch inventor_deg_dist.sh

"""

# coauthorship network
import pandas as pd
import numpy as np
import networkx as nx
import os
import logging
import argparse


def get_inventor(path_inventor, path_patent, year):
    """Build the co-invention graph from all patents dated before ``year``.

    Args:
        path_inventor: path to g_inventor_disambiguated.tsv (tab-separated);
            the ``patent_id`` and ``inventor_id`` columns are read.
        path_patent: path to g_patent.tsv (tab-separated); the ``patent_id``
            and ``patent_date`` columns are read.
        year: only patents whose year (first four characters of
            ``patent_date``) is strictly smaller than this value are used.

    Returns:
        networkx.Graph: one node per inventor that appears on a patent before
        ``year``. Two inventors are linked when they share a patent, and the
        edge attribute ``weight`` counts the rows pairing them (one per shared
        patent row). Inventors without any co-inventor are added as isolated
        nodes.
    """
    # Same logger the script configures (there __name__ == "__main__"), but
    # resolved here so the function also works when the module is imported.
    log = logging.getLogger(__name__)

    # patent_id is forced to str in both files: letting pandas infer the type
    # can leave a column with mixed int/str values when the file contains
    # non-numeric IDs, and such values would silently fail to match in the
    # inner merge below.
    df = pd.read_csv(
        path_inventor,
        delimiter="\t",
        usecols=["patent_id", "inventor_id"],
        dtype={"patent_id": str},
    )
    df_patent = pd.read_csv(
        path_patent,
        delimiter="\t",
        usecols=["patent_id", "patent_date"],
        dtype={"patent_id": str},
    )
    df_patent = df_patent.rename(columns={"patent_date": "year"})
    df_patent["year"] = df_patent["year"].str[:4].astype("int16")

    df = df.merge(df_patent, on=["patent_id"], how="inner")
    min_yr = df["year"].min()
    log.info(f"min year:{min_yr}")
    df = df[df["year"] < year]

    # Self-join on patent_id: every ordered pair of inventors on the same patent.
    on_patent = df[["patent_id", "inventor_id"]]
    df2 = on_patent.merge(on_patent, on=["patent_id"])
    log.info(len(df2))

    log.info("drop self-loops")
    df2 = df2[df2["inventor_id_x"] != df2["inventor_id_y"]]
    log.info(len(df2))

    # One row per ordered inventor pair; weight = number of paired rows.
    df2 = (
        df2.groupby(["inventor_id_x", "inventor_id_y"])
        .size()
        .reset_index(name="weight")
    )

    # construct a graph
    # each row being a pair of inventors (inventor A and inventor B) co-occur in a patent.
    G = nx.from_pandas_edgelist(
        df2, source="inventor_id_x", target="inventor_id_y", edge_attr=["weight"]
    )
    log.info(f"number of edges: {G.number_of_edges()}")
    log.info(f"number of nodes: {G.number_of_nodes()}")

    # Inventors that never appear in a pair. The pair table is symmetric, so
    # checking the inventor_id_x side alone covers every connected inventor.
    isolated = set(df["inventor_id"]).difference(df2["inventor_id_x"])
    log.info(f"number of isolated inventors: {len(isolated)}")

    # add_nodes_from adds each inventor as its own node; add_node would insert
    # the whole collection as one single node.
    G.add_nodes_from(isolated)

    return G


def load_simscore(
    year,
    path_simscore="/Volumes/Zihao_SSD2/PatentsView/cleandata/sim_score_1981_2015_top5.csv",
    path_inventor="/Volumes/Zihao_SSD2/PatentsView/rawdata/g_inventor_disambiguated.tsv",
    nrows=18751599,
):
    """Load the patent pairs of ``year`` and expand them to inventor pairs.

    Args:
        year: keep only similarity-score rows whose ``patent_year`` equals
            this value.
        path_simscore: CSV with the columns ``patent_id``,
            ``cited_patent_id`` and ``patent_year``. Defaults to the
            previously hard-coded location.
        path_inventor: tab-separated inventor file with the columns
            ``patent_id``, ``inventor_sequence`` and ``inventor_id``.
            Defaults to the previously hard-coded location.
        nrows: number of leading rows of the inventor file to read; ``None``
            reads the whole file. ``patent_id`` is parsed as an integer, so
            the rows read must carry numeric IDs. The default is the
            previously hard-coded cutoff (presumably the numeric-ID prefix of
            that file -- TODO confirm when the file is refreshed).

    Returns:
        tuple: ``(df_simscore, df_simscore_inventors)``.

        ``df_simscore`` holds the patent pairs of ``year`` with the columns
        ``patent_id_x`` (citing), ``patent_id_y`` (cited) and
        ``patent_year_x``.

        ``df_simscore_inventors`` is the cross product of citing-patent and
        cited-patent inventors for each pair, with the additional columns
        ``inventor_sequence_x``, ``inventor_id_x``, ``inventor_sequence_y``
        and ``inventor_id_y``; sorted by ``patent_id_x``, ``patent_id_y``
        with a fresh index. The ``node_distance`` column is not created
        here; ``compute_dist`` adds it.
    """
    # Same logger the script configures (there __name__ == "__main__"), but
    # resolved here so the function also works when the module is imported.
    log = logging.getLogger(__name__)

    df_simscore = pd.read_csv(
        path_simscore,
        usecols=["patent_id", "cited_patent_id", "patent_year"],
        dtype={"patent_year": "int16"},
    )
    df_simscore = df_simscore.rename(columns={
        'patent_id': 'patent_id_x',
        'cited_patent_id': 'patent_id_y',
        'patent_year': 'patent_year_x'
    })
    df_simscore = df_simscore[df_simscore["patent_year_x"] == year]

    # load only patents that have numeric IDs
    df_inventor = pd.read_csv(
        path_inventor,
        delimiter="\t",
        nrows=nrows,
        usecols=["patent_id", "inventor_sequence", "inventor_id"],
        dtype={"patent_id": int, "inventor_sequence": "uint16"},
    )

    # Suffix the columns by name rather than assigning a positional list of
    # labels: read_csv returns usecols in file order, so a positional
    # relabel would mislabel columns if the file order ever differed.
    # The suffixed frames are temporaries, so only one extra copy of the
    # inventor table is alive during each merge.
    citing_side = df_simscore.merge(df_inventor.add_suffix("_x"), on=["patent_id_x"])
    df_simscore_inventors = citing_side.merge(
        df_inventor.add_suffix("_y"), on=["patent_id_y"]
    )
    del citing_side

    df_simscore_inventors = df_simscore_inventors.sort_values(
        ["patent_id_x", "patent_id_y"]
    ).reset_index(drop=True)

    log.info(
        f"cross product between the set of citing patents inventors and cited patents inventors:{len(df_simscore_inventors)}"
    )
    log.info("simscore loading done.")
    return df_simscore, df_simscore_inventors


def _pair_distance(G, source, target):
    """Return the hop count between two nodes of G, or NaN when undefined.

    The distance is undefined when either node is absent from G or when no
    path connects them.
    """
    if source not in G or target not in G:
        return np.nan
    try:
        # No weight argument is passed, so this is the number of edges on the
        # shortest path (edge weights are ignored).
        return nx.shortest_path_length(G, source=source, target=target)
    except nx.NetworkXNoPath:
        return np.nan


def compute_dist(G, df_simscore, df_simscore_inventors, out_dir, year) -> None:
    """Compute inventor-pair distances in G and write them per patent pair.

    Args:
        G: graph whose nodes are inventor ids.
        df_simscore: patent pairs with the columns ``patent_id_x`` and
            ``patent_id_y``; it is the left side of the final merge, so every
            row is kept in the output.
        df_simscore_inventors: inventor pairs with the columns
            ``patent_id_x``, ``patent_id_y``, ``inventor_id_x``,
            ``inventor_id_y``, ``inventor_sequence_x`` and
            ``inventor_sequence_y``.
        out_dir: directory for the result file; created if missing.
        year: used only to name the output file.

    Returns:
        None. Writes ``node_dist_{year}.csv`` into ``out_dir`` containing
        ``df_simscore`` plus two columns:

        * ``node_dist_first``: distance between the inventors whose
          ``inventor_sequence`` is 0 on both patents.
        * ``node_dist_all``: minimum distance over all inventor pairs of the
          patent pair.

        Distances are missing (empty in the CSV) when an inventor is not in
        G or the two inventors are not connected.

    Side effects:
        Adds a ``node_distance`` column to ``df_simscore_inventors`` in place.
    """
    import tqdm

    # Same logger the script configures (there __name__ == "__main__"), but
    # resolved here so the function also works when the module is imported.
    log = logging.getLogger(__name__)

    # iterate through the cross inventor df - use zipped columns to speed up
    log.info("start computing inventor pair distance")

    # An inventor pair may appear under several patent pairs; each distinct
    # (citer, cited) pair is searched in the graph only once.
    dist_cache = {}
    node_dist_list = []
    inventor_pairs = zip(
        df_simscore_inventors["inventor_id_x"], df_simscore_inventors["inventor_id_y"]
    )
    for pair in tqdm.tqdm(inventor_pairs, total=len(df_simscore_inventors)):
        if pair not in dist_cache:
            dist_cache[pair] = _pair_distance(G, pair[0], pair[1])
        node_dist_list.append(dist_cache[pair])
    del dist_cache

    log.info("computing inventor pair distance done! ")

    # Missing distances are NaN (not None) so the column stays numeric even
    # when no pair at all is connected.
    df_simscore_inventors["node_distance"] = node_dist_list

    pair_keys = ["patent_id_x", "patent_id_y"]

    # minimum node distance among all inventors in citing and cited patents
    dist_all = (
        df_simscore_inventors.groupby(pair_keys)["node_distance"]
        .min()
        .reset_index(name="node_dist_all")
    )

    # node distance between first inventors between citing and cited patents.
    both_first = (df_simscore_inventors["inventor_sequence_x"] == 0) & (
        df_simscore_inventors["inventor_sequence_y"] == 0
    )
    # rename returns a new frame; renaming the filtered slice in place would
    # trigger pandas' SettingWithCopyWarning.
    dist_first = df_simscore_inventors.loc[
        both_first, pair_keys + ["node_distance"]
    ].rename(columns={"node_distance": "node_dist_first"})

    df_simscore = df_simscore.merge(dist_first, on=pair_keys, how="left").merge(
        dist_all, on=pair_keys, how="left"
    )
    del dist_first, dist_all

    # Make sure the target directory exists so the finished computation is
    # not lost to a missing folder.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"node_dist_{year}.csv")

    df_simscore.to_csv(out_path, index=False)


def main(path_inventor, path_patent, year, output_dir):
    """Run the whole pipeline for a single year.

    Prints ``year``, builds the co-invention graph from patents before
    ``year``, loads the patent and inventor pairs of ``year``, then computes
    the inventor distances and writes them to ``output_dir``.

    Args:
        path_inventor: inventor file passed on to ``get_inventor``.
        path_patent: patent file passed on to ``get_inventor``.
        year: year of the graph cutoff and of the patent pairs to load.
        output_dir: directory handed to ``compute_dist`` for the result file.
    """
    print(year)

    # Step 1: co-invention network of everything before `year`.
    graph = get_inventor(path_inventor, path_patent, year)

    # Step 2: patent pairs of `year` and their inventor cross product.
    loaded = load_simscore(year)
    patent_pairs = loaded[0]
    inventor_pairs = loaded[1]

    # Step 3: distances per inventor pair, aggregated and written to disk.
    compute_dist(graph, patent_pairs, inventor_pairs, output_dir, year)


if __name__ == "__main__":
    # Command-line entry point: parse arguments, set up file logging, run main().
    # Arguments are parsed before touching the filesystem so that --help and
    # usage errors work even when the data drive is not mounted.
    ap = argparse.ArgumentParser()
    # Every option is needed downstream, so a missing one is reported by
    # argparse instead of failing later inside pandas / os.path.
    ap.add_argument("--path_inventor", required=True, help="specify path to inventor")
    ap.add_argument("--path_patent", required=True, help="specify path to patent")
    ap.add_argument(
        "--year",
        type=int,
        required=True,
        help="patent year to compute degree distance",
    )
    ap.add_argument("--output_dir", required=True)
    args = ap.parse_args()

    # NOTE(review): relative --path_* / --output_dir values and the logs/
    # folder are resolved against this directory, while the example in the
    # module docstring runs from /Volumes/Zihao_SSD2/ with "PatentsView/..."
    # paths -- confirm which working directory is intended.
    os.chdir(r'/Volumes/Zihao_SSD2/PatentsView/')

    # basicConfig cannot create the log file inside a missing directory.
    os.makedirs("logs", exist_ok=True)
    logging.basicConfig(
        filename=os.path.join("logs", f"deg_dist_{args.year}.log"),
        format="%(asctime)s:%(levelname)s:%(message)s",
        level=logging.INFO,
    )
    # Module-level name used by the functions in this file.
    logger = logging.getLogger(__name__)

    main(
        path_inventor=args.path_inventor,
        path_patent=args.path_patent,
        year=args.year,
        output_dir=args.output_dir,
    )
